{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 划分数据集\n",
    "- random sample\n",
    "- isin\n",
    "- ~\n",
    "- cPickle.load(open('','rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import pandas as pd\n",
    "import _pickle as cPickle\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>location</th>\n",
       "      <th>age</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>usa</td>\n",
       "      <td>2</td>\n",
       "      <td>0195153448</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7</td>\n",
       "      <td>usa</td>\n",
       "      <td>0</td>\n",
       "      <td>034542252</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0002005018</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0060973129</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0374157065</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id location  age     item_id  rating\n",
       "0        2      usa    2  0195153448     0.0\n",
       "1        7      usa    0   034542252     0.0\n",
       "2        8   canada    0  0002005018     5.0\n",
       "3        8   canada    0  0060973129     0.0\n",
       "4        8   canada    0  0374157065     0.0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data=cPickle.load(\n",
    "    open('data/jxj_data.pkl','rb'))\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1149780 entries, 0 to 1149779\n",
      "Data columns (total 5 columns):\n",
      " #   Column    Non-Null Count    Dtype  \n",
      "---  ------    --------------    -----  \n",
      " 0   user_id   1149780 non-null  int64  \n",
      " 1   location  1149780 non-null  object \n",
      " 2   age       1149780 non-null  int32  \n",
      " 3   item_id   1149780 non-null  object \n",
      " 4   rating    1149772 non-null  float64\n",
      "dtypes: float64(1), int32(1), int64(1), object(2)\n",
      "memory usage: 48.2+ MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data=data.loc[~data.index.isin(data.loc[data['rating'].isna()].index)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1149772 entries, 0 to 1149779\n",
      "Data columns (total 5 columns):\n",
      " #   Column    Non-Null Count    Dtype  \n",
      "---  ------    --------------    -----  \n",
      " 0   user_id   1149772 non-null  int64  \n",
      " 1   location  1149772 non-null  object \n",
      " 2   age       1149772 non-null  int32  \n",
      " 3   item_id   1149772 non-null  object \n",
      " 4   rating    1149772 non-null  float64\n",
      "dtypes: float64(1), int32(1), int64(1), object(2)\n",
      "memory usage: 48.2+ MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "random.seed(123)\n",
    "test_index=random.sample(data.index.tolist(),int(len(data)*0.3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 344931 entries, 109815 to 473432\n",
      "Data columns (total 5 columns):\n",
      " #   Column    Non-Null Count   Dtype  \n",
      "---  ------    --------------   -----  \n",
      " 0   user_id   344931 non-null  int64  \n",
      " 1   location  344931 non-null  object \n",
      " 2   age       344931 non-null  int32  \n",
      " 3   item_id   344931 non-null  object \n",
      " 4   rating    344931 non-null  float64\n",
      "dtypes: float64(1), int32(1), int64(1), object(2)\n",
      "memory usage: 14.5+ MB\n"
     ]
    }
   ],
   "source": [
    "test_data=data.loc[test_index]\n",
    "test_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "109815     0.0\n",
       "561365     8.0\n",
       "182846     8.0\n",
       "854053     0.0\n",
       "559009     9.0\n",
       "          ... \n",
       "1064723    9.0\n",
       "427492     0.0\n",
       "1022657    0.0\n",
       "22450      0.0\n",
       "473432     0.0\n",
       "Name: rating, Length: 344931, dtype: float64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data['rating'][test_data['rating'].isna().index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>location</th>\n",
       "      <th>age</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>usa</td>\n",
       "      <td>2</td>\n",
       "      <td>0195153448</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7</td>\n",
       "      <td>usa</td>\n",
       "      <td>0</td>\n",
       "      <td>034542252</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0002005018</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0060973129</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8</td>\n",
       "      <td>canada</td>\n",
       "      <td>0</td>\n",
       "      <td>0374157065</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id location  age     item_id  rating\n",
       "0        2      usa    2  0195153448     0.0\n",
       "1        7      usa    0   034542252     0.0\n",
       "2        8   canada    0  0002005018     5.0\n",
       "3        8   canada    0  0060973129     0.0\n",
       "4        8   canada    0  0374157065     0.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data=data.loc[~data.index.isin(test_index)]\n",
    "train_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 804841 entries, 0 to 1149779\n",
      "Data columns (total 5 columns):\n",
      " #   Column    Non-Null Count   Dtype  \n",
      "---  ------    --------------   -----  \n",
      " 0   user_id   804841 non-null  int64  \n",
      " 1   location  804841 non-null  object \n",
      " 2   age       804841 non-null  int32  \n",
      " 3   item_id   804841 non-null  object \n",
      " 4   rating    804841 non-null  float64\n",
      "dtypes: float64(1), int32(1), int64(1), object(2)\n",
      "memory usage: 33.8+ MB\n"
     ]
    }
   ],
   "source": [
    "train_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "cPickle.dump(\n",
    "    test_data,\n",
    "open('./data/test_data.pkl','wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "cPickle.dump(train_data,\n",
    "            open('./data/train_data.pkl','wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
